import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_pacf
import statsmodels.api as sm
import statsmodels.tsa as tsa
from statsmodels.regression.linear_model import OLS
import os, sys
path = os.path.abspath(os.path.join('..'))
if path not in sys.path:
    sys.path.append(path)
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV, LogisticRegression, LinearRegression
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier, DummyRegressor
from eptest_main import read_ep_data, get_ct_dfs, adjust_ct_dfs, get_rolling_cts, get_interp_swaps
from eptest_main import test_adfuller, RegressionMain, get_norm_cts, get_norm_curve_diffs, get_daily_curve_diffs
from eptest_main import CT_SWAP_MAP, CT_SIZES, COT_COLS, PRICE_DUR_COLS
from Modelling.classification import run_cv, run_cv_weighted, display_cv_metrics, display_is_metrics, display_oos_metrics
from Research.signals import lookback_zscore, zscore_signal, zscore_sizing, generate_pnl, generate_mtm_pnl, generate_pnl_index,\
    generate_perf_summary, draw_signal_response, generate_exposures
# read the Excel data and generate interpolated swaps to use for the WN and US contracts
legacy, tff, futures, swaps_raw = read_ep_data()
swaps = get_interp_swaps(swaps_raw)
swaps.head()
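get_interp_swaps presumably constructs constant-maturity points between the quoted tenors for the long-end contracts; a minimal sketch of linear interpolation between two adjacent tenors (the column names and target tenor here are hypothetical):
# minimal sketch of linear tenor interpolation (column names are hypothetical)
def interp_tenor(df, lo_col, hi_col, lo_t, hi_t, t):
    w = (t - lo_t) / (hi_t - lo_t)      # linear weight between the two quoted tenors
    return (1 - w) * df[lo_col] + w * df[hi_col]
# e.g. a 15y point from quoted 10y/20y columns:
# swaps['swap_15y'] = interp_tenor(swaps_raw, 'swap_10y', 'swap_20y', 10, 20, 15)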
idx = pd.IndexSlice
futures.loc[:, idx[:, 'PX_LAST']].tail()
# look at historical CTD maturity?
# invoice spread data would be preferable, but it's harder to source
# break the data as of the Tuesday measurement date for initial modeling purposes
ct_dfs = get_ct_dfs(legacy, tff, futures, swaps)
ct = 'US'
ct_dfs[ct].head()
fig, ax = plt.subplots(figsize=(8,8), nrows=2)
sns.heatmap(ct_dfs[ct].loc[:, ['AM', 'LevFunds', 'Dealers', 'OtherRep', 'NonCom']].diff(13).corr(), annot=True, fmt='.2f', cmap='RdYlGn', ax=ax[1])
ct_dfs[ct].loc[:, ['AM', 'LevFunds', 'Dealers', 'OtherRep', 'NonCom', 'OPEN_INT']].rolling(52).mean().plot(ax=ax[0])
ax[0].legend(loc=1)
ax[0].set_title(ct)
plt.show()
# run PCA to see how much variance one or two components can explain
is_end_dt = dt.date(2018,1,1)
cols = ['AM', 'LevFunds', 'Dealers', 'NonCom']
ct = 'TY'
prepca_data = ct_dfs[ct].loc[:is_end_dt, cols].asfreq('W-TUE', method='ffill').diff(1).dropna(how='any')
prepca_data = prepca_data.subtract(prepca_data.mean()).divide(prepca_data.std())
prepca_data = prepca_data.dropna(how='any')
pca = PCA()
pca.fit(prepca_data)
pc_cols = ['PC'+str(i+1) for i in np.arange(pca.n_components_)]
exp_var = pd.DataFrame(zip(pca.explained_variance_ratio_, pc_cols),
                       columns=['Expl. Var.', 'PC'])
scores = pd.DataFrame(pca.transform(prepca_data), index=prepca_data.index, columns=pc_cols)  # PCA already fitted above
loadings = pd.DataFrame(pca.components_, index=pc_cols, columns=cols)
integrated_scores = scores.cumsum()  # integrate the weekly-change scores back to cumulative levels
fig0, ax0 = plt.subplots(figsize=(10,4),ncols=2)
exp_var.plot.bar(x='PC', y='Expl. Var.', ax=ax0[0])
ax0[1].scatter(scores['PC1'].values, scores['PC2'].values, s=20, marker="x")
ax0[1].set_xlim(-8,8)
ax0[1].set_ylim(-8,8)
ax0[1].set_xlabel('PC1 Score')
ax0[1].set_ylabel('PC2 Score')
fig0.suptitle(ct, fontsize=14)
fig0.tight_layout()
fig0.subplots_adjust(top=0.9)
fig, ax = plt.subplots(figsize=(12,8))
integrated_scores.plot(ax=ax)
ax.set_title('PCA Scores Aggregated')
plt.show()
print('Loadings Matrix:')
loadings.round(3)
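The plan notes later in this notebook mention checking the consistency of loadings over time; one quick check is to re-fit PCA on two halves of the sample and compare sign-aligned PC1 loadings, for example:
# quick stability check: re-fit PCA on each half of the sample and compare PC1 loadings
half = len(prepca_data) // 2
for sub in (prepca_data.iloc[:half], prepca_data.iloc[half:]):
    p = PCA().fit(sub)
    pc1 = pd.Series(p.components_[0], index=cols)
    print(pc1.mul(np.sign(pc1.iloc[0])).round(2).to_dict())  # sign-align on the first feature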
# compute DV01-weighted values and shift the CoT data 3 days forward afterwards (stamped Tuesday, reported Friday)
adj_ct_dfs = adjust_ct_dfs(ct_dfs, swaps, oi_avg_len=125)
# resample on Fridays with 'window'-sized rolling weekly differences
feat_adj = '_dv01'
window = 13
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=window, swap_chg_lags=[1,2])
is_end_dt = dt.date(2018,1,1)
# determine stationarity of features
# look at correlations
# - only worth looking at one of Com/NonCom, since they are essentially the same series with opposite sign
corr_feats = ['AM_dv01', 'LevFunds_dv01', 'Dealers_dv01', 'NonCom_dv01']
for k, df in r.items():
    print(k, test_adfuller(df, maxlag=1))
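test_adfuller presumably wraps the statsmodels Augmented Dickey-Fuller test; for reference, the underlying call looks like this (small p-values reject the unit-root null, i.e. the series looks stationary):
# the underlying statsmodels call a helper like test_adfuller presumably wraps
from statsmodels.tsa.stattools import adfuller
def adf_pvalues(df, maxlag=1):
    return {c: adfuller(df[c].dropna(), maxlag=maxlag)[1] for c in df.columns}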
fig, ax = plt.subplots(figsize=(8,12), nrows=3, ncols=2, dpi=300)
for i, (k, df) in enumerate(r.items()):
    col, row = i % 2, i // 2
    swap_name = CT_SWAP_MAP[k][0]
    data = df.loc[:is_end_dt, np.hstack([corr_feats, swap_name])].rename({swap_name: 'swap'}, axis=1).corr()
    sns.heatmap(data, annot=True, fmt='.2f', cmap='RdYlGn', ax=ax[row][col])
    ax[row][col].set_title(k)
    ax[row][col].set_yticklabels(ax[row][col].get_yticklabels(), rotation=60, fontsize=8)
    #ax[row][col].set_xticklabels(ax[row][col].get_xticklabels(), rotation=30, fontsize=8)
fig.tight_layout()
plt.show()
# retrieve the data and fit simple OLS on TY data + swap
adj_ct_dfs = adjust_ct_dfs(ct_dfs, swaps, oi_avg_len=125, adj_rpt=False)
ct = 'TY'
swap = CT_SWAP_MAP[ct][0]
is_end_dt = dt.date(2018,1,1)
unadj_feats = ['AM', 'LevFunds', 'Dealers', 'NonCom']
feat_adj = '_dv01'
feats = [x+feat_adj for x in unadj_feats]
# I also looked at PC1-PC3; not particularly more convincing
#feats = ['PC1', 'PC2', 'PC3']
window = 1
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=window, swap_chg_lags=[1,2], resample_per='W-TUE')
data_clean = r[ct].merge(scores, left_index=True, right_index=True).dropna(how='any')
# truncate both X and y to the in-sample window so the OLS shapes align
y = data_clean.loc[:is_end_dt, swap]
X = data_clean.loc[:is_end_dt, np.hstack([feats, swap+'_lag'+str(window)])]
mod = sm.OLS(y, sm.add_constant(X))
res = mod.fit()
res.summary()
# Since we have heavy multicollinearity, we need regularization to shrink the betas
# Use Lasso, since it also helps with feature selection
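For intuition on why Lasso over OLS here, a small self-contained toy on synthetic data: with two nearly collinear features, OLS splits the true beta between them unstably, while Lasso shrinks one toward zero (seed and alpha are arbitrary choices).
# toy illustration of Lasso under collinearity (synthetic data)
rng = np.random.default_rng(0)
x1 = rng.normal(size=500)
x2 = x1 + rng.normal(scale=0.05, size=500)         # near-duplicate of x1
y_toy = 2.0 * x1 + rng.normal(size=500)
X_toy = np.column_stack([x1, x2])
print(LinearRegression().fit(X_toy, y_toy).coef_)  # beta split unstably across x1/x2
print(Lasso(alpha=0.1).fit(X_toy, y_toy).coef_)    # one coefficient shrunk to ~0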
is_start_dt = dt.date(2010, 1, 1)
ct = 'TY'
window = 1
unadj_feat = ['AM', 'LevFunds', 'Dealers', 'NonCom']
model = Lasso()
swap = CT_SWAP_MAP[ct][0]
target = swap
num_features = np.hstack([[a+feat_adj for a in unadj_feat], swap+'_lag'+str(window)])
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=window, swap_chg_lags=[1,2], resample_per='W-TUE')
search, output_df = run_cv(r[ct].dropna(how='any'), ct, feat_adj, is_start_dt, num_features, target,
                           tss_splits=5, model=model, model_type='Regression', hyperparam_name='alpha',
                           scorer=None, sample_weight=None, max_train_size=208)
print(r'TY {0}wk changes'.format(window))
display_cv_metrics(search, 'alpha', 'r^2', log_scale=True)
print(output_df[['best_score', 'alpha', 'best_score_ratio']])
pd.Series(output_df['beta'], name='beta', index=num_features)
# Plot the model predictions versus actual changes
ty_data = r[ct].dropna(how='any')
X = ty_data.loc[:, num_features]
y = ty_data.loc[:, target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)
pred = pd.Series(search.predict(X_train), index=X_train.index).rename('pred')
fig, ax = plt.subplots()
pred.plot(ax=ax)
y_train.plot(ax=ax)
ax.legend()
ax.set_title('TY Model Predictions vs 1wk Actual Swap Chgs')
plt.show()
# Now we do the same thing, with the target variable some forward period of swap rate changes
ct = 'TY'
window = 5
fwd_per = 20
unadj_feat = ['AM', 'LevFunds']
model = Lasso()
swap = CT_SWAP_MAP[ct][0]
target = swap+'_fwd'+str(fwd_per)
num_features = np.hstack([[a+feat_adj for a in unadj_feat], swap, swap+'_lag'+str(window)])
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=window, swap_chg_lags=[1,2],
                    swap_chg_fwds=[1,5,20,60], resample_per='W-FRI')
search, output_df = run_cv(r[ct].dropna(how='any'), ct, feat_adj, is_start_dt, num_features, target,
                           tss_splits=5, model=model, model_type='Regression', hyperparam_name='alpha',
                           scorer=None, sample_weight=None, max_train_size=208)
print(r'TY {0}wk changes vs {1}d fwd swap changes'.format(window, fwd_per))
display_cv_metrics(search, 'alpha', 'r^2', log_scale=True)
print(output_df[['best_score', 'alpha', 'best_score_ratio']])
pd.Series(output_df['beta'], name='beta', index=num_features)
# Predictions again
ty_data = r[ct].dropna(how='any')
X = ty_data.loc[:, num_features]
y = ty_data.loc[:, target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)
pred = pd.Series(search.predict(X_train), index=X_train.index).rename('pred')
fig, ax = plt.subplots()
pred.plot(ax=ax)
y_train.plot(ax=ax)
ax.legend()
ax.set_title('TY 5-wk Model Predictions vs Fwd 5d Swap Chgs')
plt.show()
from eptest_main import gen_subseq_returns, analyze_sub_rets, plot_subsq_rets
# cross-validation to evaluate whether there's anything here outright
# some parameters
is_start_dt = dt.date(2010, 1, 1)
feat_adj = '_dv01'
unadj_feat = ['LevFunds']
filter_feat = 'LevFunds_dv01'
z_thresh = 1.0
clf_params = {'model__class_weight': 'balanced', 'model__solver': 'lbfgs'}
subsq_len = 22
window = 13
outputs = {}
#data
# resample on Fridays with 'window'-sized rolling weekly differences
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=window, swap_chg_lags=[1], swap_chg_fwds=[1])
normed_pos = get_norm_cts(r, swaps, 26, swap_chg_fwds=[1, 5, 10, 22, 44])
for ct in CT_SWAP_MAP.keys():
    extr_corr_pos = normed_pos[ct]
    extr_corr_pos = extr_corr_pos[abs(extr_corr_pos[filter_feat]) > z_thresh]
    swap = CT_SWAP_MAP[ct][0]
    target = swap+'_fwd'+str(subsq_len)+'_sign'
    target_unadj = swap+'_fwd'+str(subsq_len)
    target_lag = swap  # +'_lag'+str(window)
    num_features = np.hstack([[a+feat_adj for a in unadj_feat], target_lag])
    sample_weight = swap+'_fwd'+str(subsq_len)+'_abs'
    output_df = run_cv_weighted(extr_corr_pos, ct, feat_adj, is_start_dt, num_features, target, sample_weight=sample_weight,
                                tss_splits=5, model=LogisticRegression(class_weight='balanced', solver='lbfgs'),
                                hyperparam_name='C', max_train_size=208, clf_params=clf_params)
    outputs[ct] = output_df
res = pd.concat(outputs, axis=1)
res.loc['beta'] = res.loc['beta'].apply(lambda x: x.round(5))
res
# Precision/recall and TP/FP curves
# Confusion matrix
# More or fewer features doesn't matter; it's basically a coin flip
ct = 'US'
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=window, swap_chg_lags=[1], swap_chg_fwds=[1])
normed_pos = get_norm_cts(r, swaps, 26, swap_chg_fwds=[1, 5, 10, 22, 44])
us_data = normed_pos[ct].dropna(how='any')
swap = CT_SWAP_MAP[ct][0]
unadj_feat = ['AM', 'Dealers', 'LevFunds', 'NonCom']
target_unadj = swap+'_fwd'+str(subsq_len)
target = target_unadj+'_sign'
target_lag = swap  # +'_lag'+str(window)
num_features = np.hstack([[a+feat_adj for a in unadj_feat], target_lag])
X = us_data.loc[:, num_features]
y = us_data.loc[:, target]
model = LogisticRegression(class_weight='balanced', solver='lbfgs')
search, output_df = run_cv(us_data, ct, feat_adj, is_start_dt, num_features, target,
                           tss_splits=5, model=model, model_type='Classification', hyperparam_name='C',
                           scorer=None, sample_weight=None, max_train_size=208)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)
display_is_metrics(search, X_train, y_train)
# correlations filtered by extreme values
ct = 'US'
extr_corr_pos = normed_pos[ct]
extr_corr_pos = extr_corr_pos[abs(extr_corr_pos[filter_feat])>z_thresh]
swap = CT_SWAP_MAP[ct][0]
target_unadj = swap+'_fwd'+str(subsq_len)
target_lag = swap  # +'_lag'+str(window)
target_lag2 = swap+'_lag'+str(window)
num_features = np.hstack([[a+feat_adj for a in unadj_feat], target_lag, target_lag2])
fig, ax = plt.subplots(figsize=(6,4), dpi=200)
sns.heatmap(extr_corr_pos.loc[:, np.hstack([num_features, target_unadj])].corr(), annot=True, fmt='.2f', cmap='RdYlGn', ax=ax)
plt.show()
outputs['TY'].loc['beta']
# look at the timeseries of betas under rolling model fits to see if there's consistency
target = target_unadj + '_sign'  # define explicitly rather than relying on the earlier loop's value
sample_weight = swap+'_fwd'+str(subsq_len)+'_abs'
extr_corr_pos[target] = np.sign(extr_corr_pos[target_unadj])
extr_corr_pos[sample_weight] = np.abs(extr_corr_pos[target_unadj])
features = ['LevFunds_dv01', CT_SWAP_MAP[ct][0]]
datas = extr_corr_pos.loc[:, np.hstack([features, target, sample_weight])].dropna(how='any')
tss = TimeSeriesSplit(n_splits=5, max_train_size=204)
model = LogisticRegression(class_weight='balanced', solver='lbfgs')
betas = []
dates = []
is_end_dt = dt.date(2018,1,1)
for train_idx, test_idx in tss.split(datas.loc[:is_end_dt]):
    X_train = datas.iloc[train_idx].loc[:, features]
    y_train = datas.iloc[train_idx].loc[:, target]
    w_train = datas.iloc[train_idx].loc[:, sample_weight]
    model.fit(X_train, y_train, sample_weight=w_train)
    betas.append(model.coef_.squeeze())
    dates.append(extr_corr_pos.index[train_idx[-1]])
pd.DataFrame(betas, index=dates, columns=features)
is_start_dt = dt.date(2010, 1, 1)
is_end_dt = dt.date(2018,1,1)
end_data_dt = swaps.index[-1]
feat_adj = '_dv01'
filter_feat = 'LevFunds_dv01'
z_thresh = 1.0
windows = [1, 3, 8, 13]
ct = 'US'
swap = CT_SWAP_MAP[ct][0]
plot_subsq_rets(adj_ct_dfs, ct, feat_adj, filter_feat, windows, swaps=swaps, z_threshold=z_thresh, is_start_dt=is_start_dt,
                is_end_dt=is_end_dt, zscore_lookback=52, max_len=60)
feat1, feat2 = features[0], features[1]
X1 = extr_corr_pos[np.hstack([features, sample_weight])].copy(deep=True)
y1 = extr_corr_pos[target].copy(deep=True)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.25, shuffle=False)
X_train2 = X_train1[features]
w_train2 = X_train1[sample_weight]
w_test1 = X_test1[sample_weight]
# plotting vars
x_min, x_max = X1[feat1].min()-0.25, X1[feat1].max()+0.25
y_min, y_max = X1[feat2].min()-0.25, X1[feat2].max()+0.25
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
feat_mesh = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=[feat1, feat2])
# Create Preprocessor
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler(with_mean=True))
])
# Create and fit model
model = SVC(kernel='rbf', gamma='auto', class_weight='balanced')
#model = LogisticRegression(multi_class='auto', class_weight='balanced', solver='lbfgs')
pipe = Pipeline(steps=[
    ('preprocessor', num_transformer),
    ('model', model)
])
# get best regularization param from cross validation
tss = TimeSeriesSplit(n_splits=5, max_train_size=208)
param_space = np.logspace(-4, 4, 50)
search = GridSearchCV(pipe, {'model__C': param_space}, n_jobs=-1, cv=tss)
search.fit(X_train2, y_train1, model__sample_weight=w_train2)  # fit params are routed to the 'model' pipeline step via the step-name prefix
if isinstance(pipe.named_steps['model'], SVC):
    Z = search.decision_function(feat_mesh)
else:
    Z = search.predict_proba(feat_mesh).T[1]
Z = Z.reshape(xx.shape)
fig, ax = plt.subplots(dpi=300)
contours = ax.contourf(xx, yy, Z, cmap='RdYlBu', alpha=0.8)
plt.colorbar(contours)
ax.scatter(X_train2[feat1], X_train2[feat2], c=y_train1, s=w_train2*100, cmap='RdYlBu')
ax.scatter(X_test1[feat1], X_test1[feat2], c=y_test1, s=w_test1*100, cmap='RdYlBu', marker='x')
ax.set_xlabel(feat1)
ax.set_ylabel(feat2)
ax.set_title('Radial Basis SVC Decision Space')
print(r'Best reg param C: {0}'.format(search.best_params_['model__C']))
print(r'Best score: {0}'.format(np.round(search.best_score_, 4)))
plt.show()
# Same deal with curves
from itertools import combinations
curves = list(combinations(CT_SIZES.keys(), 2))
swap_chg_fwds = [1, 5, 10, 20, 60]
window = 8
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=window)
curve_diffs = get_norm_curve_diffs(curves, r, window, swaps, swap_chg_fwds=swap_chg_fwds)
daily_curves = get_daily_curve_diffs(curves, r, swaps)
daily_curves.head()
# resample on Fridays with 'window'-sized rolling weekly differences
is_start_dt = dt.date(2010, 1, 1)
is_end_dt = dt.date(2018,1,1)
end_data_dt = swaps.index[-1]
feat_adj = '_dv01'
filter_feat = 'LevFunds_dv01'
z_thresh = 1.0
windows = [5, 13, 26]
curve = 'US-FV'
curve_swap_name = curve.split("-")
curve_swap = '-'.join([CT_SWAP_MAP[curve_swap_name[0]][0], CT_SWAP_MAP[curve_swap_name[1]][0]])
curve_swap
plot_subsq_rets(adj_ct_dfs, curve, feat_adj, filter_feat, windows, swaps=swaps, z_threshold=z_thresh, is_start_dt=is_start_dt,
                is_end_dt=is_end_dt, swap_name=curve_swap, zscore_lookback=52, max_len=60, curve=True, daily_curves=daily_curves)
fig, ax = plt.subplots(figsize=(8,8), nrows=2)
curve_diffs['US-FV'].loc[:, 'AM_dv01'].plot(ax=ax[0])
daily_curves['swap_US-5y'].diff(30).shift(-30).plot(ax=ax[0], secondary_y=True)
sns.scatterplot(x=curve_diffs['US-FV'].loc[:, 'LevFunds_dv01'], y=daily_curves['swap_US-5y'].diff(30).shift(-30), ax=ax[1])
plt.show()
# parameters
pos_agg_window = 13 #weeks
trade_len = 40 #days
pt_sl = (3, 0.25)  # (take-profit, stop-loss) barriers, in z-score units
norm_window = 26 #weeks
start_dt = dt.date(2010,1,1)
end_dt = dt.date(2018,1,1)
feat_adj = '_dv01'
filter_feat = 'LevFunds_dv01'
z_thresh_positions = 1.0
z_thresh_swap = 0.1
ct = 'US'
swap = CT_SWAP_MAP[ct][0]
# get data
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=pos_agg_window, swap_chg_lags=[1], swap_chg_fwds=[1])
normed_pos = get_norm_cts(r, swaps, norm_window, swap_chg_fwds=[1])
signal_data = normed_pos[ct].loc[start_dt:end_dt]
signal_data[swap] = signal_data[swap].subtract(signal_data[swap].rolling(norm_window).mean()).\
    divide(signal_data[swap].rolling(norm_window).std())
swap_data = swaps.loc[start_dt:end_dt, swap]
# Build a pseudo-equity return index with roughly 10 years of duration exposure to the swap rate
# (needed so the PnL code built previously can be reused)
swap_data_10yr_risk = (1. + swap_data.diff(1).fillna(0) / 10.).cumprod()
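To make the scaling concrete (assuming the rates are quoted in percent): dividing the daily rate change by 10 maps a 1bp move to roughly a 10bp mark on the index, i.e. duration-10-like exposure with rising rates marking positive.
# sanity check of the pseudo-equity scaling (assumes rates are quoted in percent)
d_rate = 0.01        # a 1bp daily change, in percentage points
print(d_rate / 10.)  # 0.001, i.e. a 10bp mark on the index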
# build signals
signals_feat = zscore_signal(signal_data[filter_feat], z_thresh_positions, signal_type='Reversion')
signals_trailingcurve = zscore_signal(signal_data[swap], z_thresh_swap, signal_type='Momentum')
signals = signals_trailingcurve.to_frame().join(signals_feat)
signals['composed'] = signals.apply(lambda x: 0 if np.sign(x[filter_feat])!=np.sign(x[swap]) else x[filter_feat], axis=1)
signals['composed'].value_counts()
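The row-wise apply above can be vectorized; an equivalent form using the same columns:
# vectorized equivalent of the composed-signal apply above
same_sign = np.sign(signals[filter_feat]) == np.sign(signals[swap])
signals['composed'] = np.where(same_sign, signals[filter_feat], 0.0)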
events0, df0 = zscore_sizing(signals['composed'], swap_data, vertbar=trade_len, pt_sl=pt_sl, even_weight=True)
events = generate_pnl(events0, swap_data_10yr_risk, pct_change=True)
mtm_pnl = generate_mtm_pnl(events, swap_data_10yr_risk, log_diff=True)
pnl_index = generate_pnl_index(mtm_pnl, None)
fig, ax = plt.subplots()
pnl_index.plot(ax=ax)
ax.set_title('Equity Curve: 3z take profit, 0.25z stop-loss')
plt.show()
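zscore_sizing's vertbar/pt_sl arguments look like a triple-barrier-style exit (profit-take and stop-loss at pt_sl multiples of the target move, vertical bar at vertbar days); its internals aren't shown here, so this is only a sketch of that exit logic under those assumptions:
# hypothetical sketch of a triple-barrier exit consistent with the pt_sl/vertbar naming
def first_barrier_exit(path, entry, side, trgt, pt_sl, vertbar):
    """Return (holding_days, pnl) at the first barrier touch along a price path."""
    pt, sl = pt_sl[0] * trgt, -pt_sl[1] * trgt
    i, pnl = 0, 0.0
    for i, px in enumerate(path[:vertbar], start=1):
        pnl = side * (px - entry)
        if pnl >= pt or pnl <= sl:  # horizontal barrier (take-profit or stop-loss)
            break
    return i, pnl                   # falls through at the vertical (time) barrier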
# Distribution of each trade's PnL (not weighted by uniqueness)
fig, ax = plt.subplots()
sns.histplot(events.ret*events.trgt*1000, ax=ax, kde=True)  # distplot is deprecated in newer seaborn
ax.set_title('Return hist (~bps)')
plt.show()
# dummy check
eventsCheck = events.merge(swaps[swap].rename('exit'), left_on='t1', right_index=True)
eventsCheck['entry'] = swaps[swap]
eventsCheck['pnlBps'] = (eventsCheck['exit'] - eventsCheck['entry']) * eventsCheck['side'] * 100  # rates in %, so x100 -> bps
eventsCheck['TradeDays'] = (eventsCheck['t1']-eventsCheck.index)
fig, ax = plt.subplots()
sns.histplot(eventsCheck['pnlBps'], ax=ax, bins=16, kde=True)  # distplot is deprecated in newer seaborn
ax.set_title('Return hist (bps)')
plt.show()
# Performance summary stats
generate_perf_summary(events, swap_data_10yr_risk)
# Positions
exposures = generate_exposures(events, swap_data_10yr_risk)
fig, ax = plt.subplots()
exposures.rename('Exposure').plot(ax=ax)
signal_data[filter_feat].plot(ax=ax)
ax.set_title('Exposure vs Signal')
ax.legend()
plt.show()
# Signal Response Plot
# Set thresholds to ~0, so the signal is effectively just its sign
# Used to evaluate whether larger signals are more valuable
# parameters
pos_agg_window = 13 #weeks
trade_len = 40 #days
pt_sl = (3,0.25)
norm_window = 26 #weeks
is_end_dt = dt.date(2018,1,1)
feat_adj = '_dv01'
filter_feat = 'LevFunds_dv01'
z_thresh_positions = 0.01
z_thresh_swap = 0.01
ct = 'US'
swap = CT_SWAP_MAP[ct][0]
# get data
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=pos_agg_window, swap_chg_lags=[1], swap_chg_fwds=[1])
normed_pos = get_norm_cts(r, swaps, norm_window, swap_chg_fwds=[1])
signal_data = normed_pos[ct]
signal_data[swap] = signal_data[swap].subtract(signal_data[swap].rolling(norm_window).mean()).\
    divide(signal_data[swap].rolling(norm_window).std())
swap_data = swaps.loc[:is_end_dt, swap]
swap_data_10yr_risk = (1. + swap_data.diff(1).fillna(0) / 10.).cumprod()
# build signals
signals_feat = zscore_signal(signal_data[filter_feat], z_thresh_positions, signal_type='Reversion')
signals_trailingcurve = zscore_signal(signal_data[swap], z_thresh_swap, signal_type='Momentum')
signals = signals_trailingcurve.to_frame().join(signals_feat)
signals['composed'] = signals.apply(lambda x: 0 if np.sign(x[filter_feat])!=np.sign(x[swap]) else x[filter_feat], axis=1)
signals['composed'].value_counts()
events0, df0 = zscore_sizing(signals['composed'], swap_data, vertbar=trade_len, pt_sl=pt_sl, even_weight=True)
events = generate_pnl(events0, swap_data_10yr_risk, pct_change=True)
events['entry_score'] = signal_data[filter_feat]
draw_signal_response(events, notch=False)
# parameters
pos_agg_window = 5 #weeks
trade_len = 20 #days
pt_sl = (6,6)
norm_window = 26 #weeks
is_end_dt = dt.date(2021,1,1)
r = get_rolling_cts(adj_ct_dfs, feat_adj, window=pos_agg_window)
curve_diffs = get_norm_curve_diffs(curves, r, pos_agg_window, swaps, swap_chg_fwds=swap_chg_fwds, norm_window=norm_window)
daily_curves = get_daily_curve_diffs(curves, r, swaps)
filter_feat = 'LevFunds_dv01'
z_thresh_positions = 1.0
z_thresh_mom = 0.01
#windows = [3, 8, 13, 26]
curve = 'US-FV'
swap_curve = 'swap_US-5y'
signal_data = curve_diffs[curve].loc[:is_end_dt, [filter_feat, swap_curve]]
signal_data[swap_curve] = signal_data[swap_curve].subtract(signal_data[swap_curve].rolling(norm_window).mean()).\
    divide(signal_data[swap_curve].rolling(norm_window).std())
swap_data = daily_curves.loc[:is_end_dt, swap_curve]
swap_data_10yr_risk = (1. + swap_data.diff(1).fillna(0) / 10.).cumprod()
# build signals
signals_feat = zscore_signal(signal_data[filter_feat], z_thresh_positions, signal_type='Reversion')
signals_trailingcurve = zscore_signal(signal_data[swap_curve], z_thresh_mom, signal_type='Momentum')
signals = signals_trailingcurve.to_frame().join(signals_feat)
signals['composed'] = signals.apply(lambda x: 0 if np.sign(x[filter_feat])!=np.sign(x[swap_curve]) else x[filter_feat], axis=1)
signals['composed'].value_counts()
events0, df0 = zscore_sizing(signals['composed'], swap_data, vertbar=trade_len, pt_sl=pt_sl, even_weight=True)
events = generate_pnl(events0, swap_data_10yr_risk, pct_change=False)
mtm_pnl = generate_mtm_pnl(events, swap_data_10yr_risk, log_diff=True)
pnl_index = generate_pnl_index(mtm_pnl, None)
fig, ax = plt.subplots()
pnl_index.plot(ax=ax)
ax.set_title('US-FV Curve Momentum Strategy on 5wk Relative LevFunds Z-Score Positioning Changes')
plt.show()
# Distribution of each trade's PnL (not weighted by uniqueness)
fig, ax = plt.subplots()
sns.histplot(events.ret*events.trgt*1000, ax=ax, kde=True)  # distplot is deprecated in newer seaborn
ax.set_title('Return hist (~bps)')
plt.show()
# Positions
exposures = generate_exposures(events, swap_data_10yr_risk)
fig, ax = plt.subplots()
exposures.plot(ax=ax)
ax.set_title('Exposure')
plt.show()
# Performance summary stats
generate_perf_summary(events, swap_data_10yr_risk)
# dummy check
events_wpnl = events.merge(swap_data, left_index=True, right_index=True).merge(
    swap_data, left_on='t1', right_index=True, suffixes=('_entry', '_exit'))
((events_wpnl['swap_US-5y_exit']-events_wpnl['swap_US-5y_entry']) * events_wpnl['side']).cumsum().plot()
- Check for general numerical issues
- The US June 2015 contract had a delivery gap of ~5 years relative to the March 2015 contract, due to Treasury issuance patterns in the early 2000s
- see: https://www.cmegroup.com/trading/interest-rates/files/mar-15-jun-15-roll-analysis.pdf
- the CTD MM swap then gets closer to the 20y... consider interpolating
- Create an offset series for Fridays; let's just shift it 3 days (data is stamped Tuesday but reported Friday, or later around holidays)
- Split train+CV and full test data. For test data, let's use 2018+
Start by evaluating the training data visually. We need to work with stationary time series, so let's do the transforms upfront:
- DV01-weighted positions or % of open interest will be more informative: dv01 = dur * px / 100 * ct_size / 10000 (worked example after this list)
- Rolling x-week changes in transformed CoT data
- Consider PCA of this CoT data and look at consistency of loadings over time
- Rolling x-week changes in swap data
- Indicator variables for whether the CoT change was 'same-way' as the market move
Do feature selection second: random-forest-style feature importance, Lasso
Features need to be fit on a weekly basis, since this data is weekly
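A worked instance of the DV01 formula above, with hypothetical round-number inputs:
# worked instance of dv01 = dur * px / 100 * ct_size / 10000 (inputs are hypothetical)
dur, px, ct_size = 6.5, 120.0, 100000    # modified duration (yrs), futures price, contract size ($)
dv01 = dur * px / 100 * ct_size / 10000  # = 6.5 * 1.2 * 10 = $78 per bp, per contract
print(dv01)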
# try something with many features
# get the data first
windows = [1, 5, 13]
norm_window = 26 #wks
fwd_per = 5 #days
fullwindow_dict = {}
dv01_ct_dfs = {}
for k, df in adj_ct_dfs.items():
    swap = CT_SWAP_MAP[k][0]
    dv01_ct_dfs[k] = df.loc[:, corr_feats].merge(swaps[swap].rename('swap'), left_index=True, right_index=True).copy(deep=True)
# build the weekly panel once, then take each rolling difference
weekly = pd.concat(dv01_ct_dfs, axis=1).asfreq('W-FRI', method='ffill')
for x in windows:
    fullwindow_dict[str(x)] = weekly.diff(x)
full = pd.concat(fullwindow_dict, axis=1).swaplevel(0, 1, axis=1)
full.columns = pd.MultiIndex.from_tuples([(col[0], '_'.join(reversed(col[1:])).strip()) for col in full.columns.values])  # flatten to (ct, '<feature>_<window>') labels
full_norm = full.subtract(full.rolling(norm_window).mean())
full_norm = full_norm.divide(full_norm.rolling(norm_window).std())
# join forward swap diffs
swap_diffs = swaps.diff(fwd_per).shift(-fwd_per)  # fwd_per-day change, shifted back to the observation date
for ct in CT_SWAP_MAP.keys():
swap = CT_SWAP_MAP[ct][0]
full_norm[(ct, 'swap_fwd'+str(fwd_per))] = swap_diffs[swap]
# compute curve data
curve_diffs = {}
for curve in curves:
    swap0 = CT_SWAP_MAP[curve[0]][0]
    swap1 = CT_SWAP_MAP[curve[1]][0]
    curve_diff = full_norm[curve[1]] - full_norm[curve[0]]
    curve_name = curve[1] + '-' + curve[0]
    curve_diffs[curve_name] = curve_diff
norm_curve_data = pd.concat(curve_diffs, axis=1)
norm_curve_data.dropna().head()
# run models
ct = 'TY-TU'
data_raw = norm_curve_data.copy(deep=True)
target_swap = 'swap_fwd' + str(fwd_per)
target = target_swap + '_sign'
target_weight = target_swap + '_abs'
data = data_raw[ct].dropna(how='any')
features = data.columns[data.columns != target_swap]
data[target] = data[target_swap].apply(np.sign)
data.loc[data[target]==0, target] = -1.
data[target_weight] = data[target_swap].apply(np.abs)
X = data.loc[:, np.hstack([features, target_weight])].copy(deep=True)
y = data.loc[:, target].copy(deep=True)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, shuffle=False)
X_train2 = X_train1[features]
X_test2 = X_test1[features]
w_train2 = X_train1[target_weight]
w_test1 = X_test1[target_weight]
# Create Preprocessor
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler(with_mean=True))
])
# Create and fit model
model = SVC(kernel='poly', gamma='auto', class_weight='balanced')
#model = LogisticRegression(multi_class='auto', class_weight='balanced', solver='lbfgs')
pipe = Pipeline(steps=[
    ('preprocessor', num_transformer),
    ('model', model)
])
# get best regularization param from cross validation
tss = TimeSeriesSplit(n_splits=5, max_train_size=208)
param_space = np.logspace(-4, 4, 50)
search = GridSearchCV(pipe, {'model__C': param_space}, n_jobs=-1, cv=tss)
search.fit(X_train2, y_train1, model__sample_weight=w_train2)
display_cv_metrics(search, 'C', 'accuracy', log_scale=True)
display_is_metrics(search, X_train2, y_train1)
display_oos_metrics(search, X_test2, y_test1)